% Table: test-set F1 scores of the DeBERTa group mention detection classifiers
% (UK manifesto corpus, 5-times repeated 5-fold cross-validation), by group category.
% Each cell is formatted as: average [lower, upper bound of the 90% quantile range].
\begin{table}[!htbp]% [!h] alone is rewritten to [!ht] with a warning; allow t/b/p fallbacks
\caption{%
Summary of test set performances in terms of the F1 score of DeBERTa group mention detection classifiers fine-tuned and evaluated on our corpus of labeled UK manifesto sentences.
Values outside (inside) brackets report the average (90\% quantile range) of the performances of 25 different classifiers fine-tuned in a 5-times repeated 5-fold cross-validation scheme.
Rows report results for the different group categories included in our coding scheme.
The ``micro'' metric reports results when treating different group types as one.
Columns distinguish between different evaluation schemes (i.e., different ways to compute the F1 score).
\emph{Note:} \texttt{seqeval} is the strict metric proposed by \citet{ramshaw_text_1995} and implemented by \citet{nakayama_seqeval_2018}.%
}
% Label placed directly after the caption so it picks up the table counter.
\label{tab:uk-manifestos_5x5-crossval_deberta-finetuning_testset_by_cat}
\centering
\fontsize{10}{12}\selectfont
\begin{tabular}[t]{lcccc}
\toprule
% Spanning header: columns 2-4 are mention-level metrics; column 5 is sentence level.
\multicolumn{1}{c}{ } & \multicolumn{3}{c}{Mention level} & \multicolumn{1}{c}{ } \\
\cmidrule(l{3pt}r{3pt}){2-4}
Category & \texttt{seqeval} & cross-span avg. & within-sentence avg. & Sentence level\\
\midrule
% "micro" pools all group types; the remaining rows are individual group categories.
micro & 0.82 [0.76, 0.87] & 0.88 [0.85, 0.92] & 0.88 [0.84, 0.91] & 0.97 [0.96, 0.98]\\
SG & 0.83 [0.77, 0.90] & 0.87 [0.81, 0.92] & 0.88 [0.79, 0.94] & 0.95 [0.92, 0.97]\\
PG & 0.90 [0.86, 0.95] & 0.93 [0.90, 0.97] & 0.93 [0.85, 0.97] & 0.98 [0.96, 0.99]\\
PI & 0.81 [0.75, 0.87] & 0.81 [0.74, 0.88] & 0.82 [0.75, 0.88] & 0.92 [0.88, 0.95]\\
ORG & 0.80 [0.71, 0.88] & 0.80 [0.70, 0.87] & 0.82 [0.72, 0.89] & 0.92 [0.89, 0.94]\\
ISG & 0.74 [0.66, 0.82] & 0.67 [0.60, 0.75] & 0.70 [0.61, 0.80] & 0.83 [0.77, 0.87]\\
\bottomrule
\end{tabular}
\end{table}
